* This Stata dofile is written to accompany the papers:
* A Leigh & P van der Eng, 'Inequality in Indonesia: What Can We Learn from Top Incomes?' (2009), Journal of Public Economics, 93(1-2): 209-212
* A Leigh & P van der Eng, 'Top Incomes in Indonesia 1920-2004' in A.B. Atkinson and T. Piketty (eds) (2009) Top Incomes Over the Twentieth Century: Volume II - A Global Perspective, Oxford, Oxford University Press
* It follows an Excel routine devised by Tony Atkinson.
* Feel free to use or adapt it, but please cite those papers.
* Questions to andrew_leigh@ksg02.harvard.edu

* Session setup: do not pause for --more--, and start with no programs or data
* left over from an earlier run.
set more off
program drop _all
clear

* Relative file names used below are resolved against the project directory.
cd "C:\Users\Andrew\My publications\Indonesian Top Incomes\"

****************************************
* topinc program 
****************************************

program define topinc1
 * topinc1 works on the dataset currently in memory: one observation per income
 * band (variables batas1, batas2, jmwp, pk12) with the variables households,
 * personalincome and year also present. It estimates the income share (in
 * percent) and the income cutoff of the top 10, 5, 1, 0.5, 0.1, 0.05 and 0.01
 * percent, and posts a single row of results to the postfile handle _1.
 * Requirements and side effects visible in the code:
 *   - handle _1 must already be open with 28 matching columns;
 *   - the data in memory are replaced by a one-row collapsed dataset.
 * Variable suffixes 10 5 1 05 01 005 001 stand for the percentiles
 * 10 5 1 .5 .1 .05 .01 (the two lists are paired inside each "for" command:
 * X takes the suffix, Y the numeric percentile).

 * Renaming the source variables to the descriptive names used below
 ren batas1 band_start
 ren batas2 band_end
 ren jmwp persons
 ren pk12 income

 * The household count is used as the population denominator from here on.
 ren households population

 * Some bands have zero persons, so we need to drop them
 drop if persons==0

 * n ranks bands from the lowest band_start upwards, N is the number of bands,
 * rn ranks bands from the highest downwards. With tsset rn, the lag operator
 * l. refers to the next-higher band, which lets the cumulative sums below
 * accumulate from the top of the distribution.
 sort band_start
 gen n=_n
 gen N=_N
 gsort -n
 gen rn=_n
 tsset rn

 * The control totals are in billions of Rupiah. We adjust accordingly.
 replace personalincome=personalincome*1000000000

 * Disabled alternative scaling, for control totals in trillions of Rupiah:
 * replace personalincome=personalincome*1000000000000
 *replace income=income/1000000000000
 *replace band_start=band_start/1000000000000

 * A few checks: tabulated income as a fraction of the control total, tabulated
 * persons as a fraction of the population, and mean income per tabulated person.
 * These three variables are kept and posted at the end.
 egen temp1=sum(income)
 gen declaredincshare=temp1/personalincome
 egen temp2=sum(persons)
 gen taxpayershare=temp2/population
 gen meantaxpayerincome=temp1/temp2
 sum declaredincshare taxpayershare meantaxpayerincome
 drop temp*
 
 * Make population and personalincome constant across observations by setting
 * each to its maximum over the dataset.
 for any population personalincome: egen temp=max(X) \ replace X=temp \ drop temp
 gen meanincome=(personalincome)/population
 * pctfreq: percent of the population in this band.
 * cumpctfreq: percent of the population in this band and all higher bands.
 gen pctfreq=(persons/population)*100
 gen cumpctfreq=pctfreq if rn==1
 replace cumpctfreq=pctfreq+l.cumpctfreq if rn>1
 * totalincome: percent of the control-total income falling in this band.
 * cumtotalincome: the same, accumulated over this band and all higher bands.
 gen totalincome=(income/personalincome)*100
 gen cumtotalincome=totalincome if rn==1
 replace cumtotalincome=totalincome+l.cumtotalincome if rn>1

* Band lower limit and band mean, each expressed relative to overall mean income.
* cellmean reduces algebraically to income/persons for the band.
* NOTE(review): the quoted text after the colon sits in the value-label-name
* position of generate, although it reads like a variable label -- confirm it
* behaves as intended in the Stata version used.
gen lowerlim:"Lower limit relative to mean"=band_start/meanincome
gen cellmean=(totalincome/pctfreq)*meanincome
gen cellmeanrel:"Cell mean relative to mean"=cellmean/meanincome
* Re-declare the ordering on n so that the lead operator f. refers to the
* next-higher band. All quantities below skip the lowest band (n>1), and any
* expression using f. is missing in the highest band, which has no lead.
tsset n
* midptmean and alphacalc are diagnostics; alphacalc is referenced only by the
* commented-out Pareto section further down.
gen midptmean=(lowerlim+f.lowerlim)/2-cellmeanrel if n>1
gen alphacalc:"Alpha calc from F"=ln(cumpctfreq/f.cumpctfreq)/ln(f.lowerlim/lowerlim) if n>1
* hupper and hmeansplit are population-share switch points: each percentile Y is
* compared with them below to choose which branch of a formula applies.
gen hupper=(cumpctfreq*2*(cellmeanrel-lowerlim)+f.cumpctfreq*(lowerlim+f.lowerlim-2*cellmeanrel))/(f.lowerlim-lowerlim) if n>1
gen hmeansplit=(cumpctfreq*(cellmeanrel-lowerlim)+f.cumpctfreq*(f.lowerlim-cellmeanrel))/(f.lowerlim-lowerlim) if n>1

* LINEAR BOUNDS - LOWER
* lowerX is filled only in the band that straddles percentile Y, i.e. where
* cumpctfreq>Y and the next-higher band has cumpctfreq<=Y. In the highest band
* f.cumpctfreq is missing, so the condition is never true there.
* cumtotalinc is an abbreviation of cumtotalincome.
for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen lowerX=cumtotalinc+cellmeanrel*(Y-cumpctfreq) if cumpctfreq>Y & f.cumpctfreq<=Y

* LINEAR BOUNDS - UPPER
* Smaller of two linear extrapolations: one from this band's lower limit and one
* from the next band's lower limit. upperX is not used later in this program.
for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen upperX=min((cumtotalinc+lowerlim*(Y-cumpctfreq)),(f.cumtotalinc+f.lowerlim*(Y-f.cumpctfreq))) if cumpctfreq>Y & f.cumpctfreq<=Y

* REFINED BOUNDS - LOWER RESTRICTED BOUND FOR SHARE
* The test lowerX~=. restricts each calculation to the straddling band.
gen lowerdensity=pctfreq/(2*(cellmeanrel-lowerlim)) if n>1
for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01: gen lboundX=cumtotalinc-(cumpctfreq-Y)*(lowerlim+(cumpctfreq-Y)/(2*lowerdensity)) if lowerX~=.

* REFINED BOUNDS - UPPER RESTRICTED BOUND FOR SHARE
* Two branches: Y above hupper uses this band's lower limit, Y at or below hupper
* works back from the next band using upperdensity.
gen upperdensity= pctfreq*2*(cellmeanrel-lowerlim)/((f.lowerlim-lowerlim)^2) if n>1
for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen uboundX=cumtotalinc-(cumpctfreq-Y)*(lowerlim) if lowerX~=. & Y>hupper
for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : replace uboundX=f.cumtotalinc+(Y-f.cumpctfreq)*(f.lowerlim-(Y-f.cumpctfreq)/(2*upperdensity)) if lowerX~=. & Y<=hupper

* MEAN SPLIT HISTOGRAM
* Two densities per band, one used on each side of the switch point hmeansplit.
* msh_h and msh_l below are abbreviations of msh_higherden and msh_lowerdensity,
* so they rely on variable abbreviation being enabled and unambiguous.
* mshX is the estimate that becomes the reported share.
gen msh_higherden=pctfreq*((f.lowerlim-cellmeanrel)/(cellmeanrel-lowerlim))/(f.lowerlim-lowerlim) if n>1
gen msh_lowerdensity=pctfreq*((cellmeanrel-lowerlim)/(f.lowerlim-lowerlim))/(f.lowerlim-cellmeanrel) if n>1
for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen mshX=cumtotalinc-(cumpctfreq-Y)*(lowerlim+(cumpctfreq-Y)/(2*msh_h)) if lowerX~=. & Y>hmeansplit
for any  10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : replace mshX=f.cumtotalinc+(Y-f.cumpctfreq)*(f.lowerlim-(Y-f.cumpctfreq)/(2*msh_l)) if lowerX~=. & Y<=hmeansplit

* CALCULATING BAND CUTOFFS
* index is 1 where msh_higherden>=msh_lowerdensity and -1 otherwise. Multiplying
* by index inside and outside max() yields the larger of the two candidate
* cutoffs when index is 1 and the smaller when index is -1. The result is scaled
* by meanincome, so cutoffs are in the same units as meanincome.
gen index=1 if msh_higher>=msh_lower
recode index .=-1
for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01 : gen cutoffX=meanincome*index*max((index*(lowerlim+(cumpctfreq-Y)/msh_higherden)),(index*(f.lowerlim-(Y-f.cumpctfreq)/msh_lower))) if lowerX~=.

* PARETO EXTRAPOLATION (results unconvincing) - disabled
*egen meanalpha=mean(alpha) if alpha<4
*for any 10 5 1 05 01 005 001 \ num 10 5 1 .5 .1 .05 .01: gen ystar=(cumpctfreq/Y)^(1/meanalpha)*band_start if n==5 \ gen paretoshareX=Y*meanalpha/(meanalpha-1)*ystar/meanincome \ drop ystar

* COLLAPSING AND POSTING RESULTS
* The reported share is the mean-split-histogram estimate.
for any  10 5 1 05 01 005 001: gen shareX=mshX
*for any  10 5 1 05 01 005 001: replace shareX=paretoshareX if shareX==.
egen taxpayers=sum(persons) 
* Each per-percentile variable is nonmissing only in its straddling band, so
* taking the maximum reduces the data to one row holding every result. The
* wildcard lower* also carries lowerlim and lowerdensity, which are not posted.
collapse (max) year (max) share* (max) lower* (max) population (max) taxpayers (max) cutoff* (max) meanincome (max) declaredincshare (max) taxpayershare (max) meantaxpayerincome
* Append the single result row to the open postfile _1.
post _1 (year) (share10) (share5) (share1) (share05) (share01) (share005) (share001) (cutoff10) (cutoff5) (cutoff1) (cutoff05) (cutoff01) (cutoff005) (cutoff001) (lower10) (lower5) (lower1) (lower05) (lower01) (lower005) (lower001) (population) (taxpayers) (meanincome) (declaredincshare) (taxpayershare) (meantaxpayerincome)
end

***************************************************************
* This next section runs the above program, and saves the results in shares-ext-1989.dta
***************************************************************

* Work from the project directory so that the relative file names below
* (controltotals-1989, shares-ext-1989.dta) resolve.
cd "C:\Users\Andrew\My publications\Indonesian Top Incomes\"

* Top income shares 1989-2003
* Close the handle if an earlier, aborted run left it open; otherwise postfile
* refuses to reuse the name. capture swallows the error when it is not open.
capture postclose _1
#delimit ;
postfile _1 year share10 share5 share1 share05 share01 share005 share001 cutoff10 cutoff5 cutoff1 
cutoff05 cutoff01 cutoff005 cutoff001 lower10 lower5 lower1 lower05 lower01 lower005 lower001 
population taxpayers meanincome declaredincshare taxpayershare meantaxpayerincome using shares-ext-1989.dta, replace;
#delimit cr

* For each year: load that year's band tabulation, attach the control totals for
* the year (nokeep drops control-total years with no tabulation), remove the
* original year variable tahun and the merge marker, then let topinc1 compute
* the estimates and post one row to _1.
forvalues yr = 1989/2003 {
	use "C:\Users\Andrew\My publications\Indonesian Top Incomes\Individual files\individual`yr'", clear
	gen year=`yr'
	sort year
	merge year using controltotals-1989, nokeep
	drop tahun _merge
	topinc1
}
postclose _1

use shares-ext-1989.dta, clear
* Dropping non-credible estimates
replace share05=. if year>=2002 & year<=2003
replace share01=. if year>=1991 & year<=2001
sort year
save, replace

* How many bands are there? Summarize each yearly file for inspection.
forvalues yr = 1989/2003 {
	use "C:\Users\Andrew\My publications\Indonesian Top Incomes\Individual files\individual`yr'", clear
	summarize
}

* Reload the results and show the cutoffs as whole numbers for browsing.
* The formatted data are left in memory and are not saved.
use shares-ext-1989.dta, clear
format cutoff* %16.0f
*for var cutoff*: replace X=X/1000000 \ format X %3.1f

* End the do-file here. What follows is exploratory material that is meant to
* be run by hand, not as part of the main run.
exit

* Control totals
* Note that the wage bill is in trillions of Rupiah
* Builds a per-year table for 1989-2003 that puts household counts next to the
* estimated taxpayer counts and mean income. This section sits below the point
* where the do-file deliberately halts, so it only runs when executed by hand.
use denominators-1989, clear
keep if year>=1989 & year<=2003
* Old-style match merge on year, bringing in only meanincome and taxpayers from
* the results file.
* NOTE(review): this merge syntax expects both datasets to be sorted by year.
* The results file is saved sorted by year earlier in this do-file; confirm
* that denominators-1989 is sorted as well.
merge year using shares-ext-1989, keep(meanincome taxpayers)
*gen employees=population*.66
* Households are taken to be half of population.
gen households=population/2
drop _merge population
* Truncate mean income to a whole number.
replace meanincome=int(meanincome)
* NOTE(review): "household" below is an abbreviated variable name. Which variable
* it resolves to depends on the contents of denominators-1989 -- confirm.
order year households taxpayers household meanincome

* Note: The employee data cannot be actual wages, since its sum exceeds the national wage bill, 
* and amounts to 80% of household consumption expenditure. Instead, what if the withholding 
* files are company-based? That would explain the smaller tax return totals, and the figures would denote
* the number of employees they employ. 

* In this case, we could either: 
* (a) use the totals as being total earnings for all employees (dodgy), or 
* (b) use only the individual files.



for var